*This file loads in EPI CPS data, sets restrictions, cleans and creates
*variables, merges in commuting zones, and saves.
*Author: Teresa Kroeger
*----------------------------------------------------------------------------
	
********************************************************************************
* If used, cite: Economic Policy Institute. 2020. Current Population Survey Extracts, Version 1.0.10, https://microdata.epi.org.
* The data are available for download here: https://microdata.epi.org/epi_cpsorg_1979_2021.zip.
********************************************************************************

	* Download EPI CPS data and unzip.
	copy "https://microdata.epi.org/epi_cpsorg_1979_2023.zip" "$data/cps/input/epi_cpsorg_1979_2023.zip", replace
	cd "$data/cps/input"
	unzipfile "$data/cps/input/epi_cpsorg_1979_2023.zip"
	cd "$code"
	
	* remove unused EPI CPS files
	forvalues year = 1979/2012 {
		erase "$data/cps/input/epi_cpsorg_`year'.dta"
	}
	
	erase "$data/cps/input/epi_cpsorg_1979_2023.zip"

	* install the command load_epiextracts

	net install load_epiextracts, from("https://microdata.epi.org/stata")
	
	local startdate 2013m1
	local enddate 2023m8
	

	* Load all of the EPI CPS CPS ORG
	load_epiextracts, begin(`startdate') end(`enddate') sample(org) sourcedir("$data/cps/input")

********************************************************************************
* Define Sample Restrictions
********************************************************************************

	gen insample=1
	replace insample=0 if age<16 | age>65
	replace insample=0 if emp!=1
	replace insample=0 if selfemp==1
	replace insample=0 if selfinc==1
	replace insample=0 if paidhre==.
	replace insample=0 if hoursu1i<=3 | hoursu1i==. // usually works more than 3 hours per week
	
	tab age if insample

********************************************************************************
* Create month-year variable
********************************************************************************

	gen mdate=ym(year,month)
	format mdate %tm	
	
********************************************************************************
* Cleaning
********************************************************************************

	gen lths=.
	replace lths=1 if educ==1
	replace lths=0 if educ>=2 & educ<=6
	la var lths "Less than high school education"
	tab lths educ
	
	gen hs_less=.
	replace hs_less=1 if educ==1 | educ==2
	replace hs_less=0 if educ>=3 & educ<=6
	la var hs_less "High school degree or less"
	tab hs_less educ
	
	* Recode parttime
	gen ftpt_hrs=.
	replace ftpt_hrs=1 if hoursu1i>=35 & hoursu1i<=99
	replace ftpt_hrs=2 if hoursu1i>=1 & hoursu1i<=34
	la define lftpt_hrs 1 "Full-time (>=35 hrs/wk)" 2 "Part-time (1-34 hrs/wk)", modify
	label val ftpt_hrs lftpt_hrs
	la var ftpt_hrs "Status based on usual weekly hours in main job, with imputations (hoursu1i)"
	
	gen fullpart=ftpt_hrs
	
	gen hours=hoursu1i
	replace hours=. if hoursu1i==0 | hoursu1i==999
	tab hours

	* Crosswalk CPS industry codes with NAICS 2-digit codes
	// This is the code for getting NAICS codes. I ran it on Hourly Workers Ind_Region.dta and it works. We will have to use naics_v2 to match it with sector.

	gen ind= ind12 if inrange(year,2014,2019) 
	* different ind codes from 2020 onward
	
	gen naics = .

	replace naics = 11  if ind >= 170 & ind <= 290

	replace naics = 21 if ind >= 370 & ind <= 490

	replace naics = 22 if ind >= 570 & ind <= 690

	replace naics = 23 if ind == 770

	replace naics = 31 if ind >= 1070 & ind <= 1790

	replace naics = 32 if (ind >= 1870 & ind <= 2390) | (ind >= 2470 & ind <= 2590)

	replace naics = 33 if (ind >= 2670 & ind <= 2990) | (ind >= 3070 & ind <= 3690)

	replace naics = 42 if ind >= 4070 & ind <= 4590

	replace naics = 44 if ind >= 4670 & ind <= 5190

	replace naics = 45 if ind >= 5275 & ind <= 5790

	replace naics = 48 if ind >= 6070 & ind <= 6290

	replace naics = 49 if ind >= 6370 & ind <= 6390

	replace naics = 51 if ind >= 6470 & ind<= 6780

	replace naics = 52 if ind >= 6870 & ind <= 6990

	replace naics = 53 if ind >= 7070 & ind <= 7190

	replace naics = 54 if ind >= 7270 & ind <= 7490

	replace naics = 55 if ind == 7570

	replace naics = 56 if ind >= 7580 & ind <= 7790

	replace naics = 61 if ind >= 7860 & ind <= 7890

	replace naics = 62 if ind >= 7970 & ind <= 8470

	replace naics = 71 if ind >= 8560 & ind <= 8590

	replace naics = 72 if ind >= 8660 & ind <= 8990

	replace naics = 81 if ind >= 8770 & ind <= 9290

	replace naics = 92 if ind >= 9370 & ind <= 9590


	gen naics_v2 = string(naics)

	replace naics_v2 = "31-33" if naics >= 31 & naics <= 33

	replace naics_v2 = "44-45" if naics >= 44 & naics <= 45

	replace naics_v2 = "48-49" if naics >= 48 & naics <= 49


	* Crosswalk CPS occupation codes with SOC 2-digit codes
	gen occ= occ10 if inrange(year,2014,2019) 
	* different ind codes from 2020 onward
	
	gen soc = .

	replace soc = 11  if occ >= 10 & occ <= 440

	replace soc = 13 if occ >= 500 & occ <= 960

	replace soc = 15 if occ >= 1005 & occ <= 1240

	replace soc = 17 if occ >= 1305 & occ <= 1560

	replace soc = 19 if occ >= 1600 & occ <= 1980

	replace soc = 21 if occ >= 2001 & occ <= 2060

	replace soc = 23 if occ >= 2100 & occ <= 2180

	replace soc = 25 if occ >= 2205 & occ <= 2555

	replace soc = 27 if occ >= 2600 & occ <= 2970

	replace soc = 29 if occ >= 3000 & occ<= 3550

	replace soc = 31 if occ >= 3601 & occ <= 4655

	replace soc = 33 if occ >= 3700 & occ <= 3960

	replace soc = 35 if occ >= 4000 & occ <= 4160

	replace soc = 37 if occ >= 4200 & occ <= 4255

	replace soc = 39 if occ >= 4330 & occ <= 4655

	replace soc = 41 if occ >= 4700 & occ <= 4965

	replace soc = 43 if occ >= 5000 & occ <= 5940

	replace soc = 45 if occ >= 6005 & occ <= 6130

	replace soc = 47 if occ >= 6200 & occ <= 6950

	replace soc = 49 if occ >= 7000 & occ <= 7640

	replace soc = 51 if occ >= 7700 & occ <= 8990

	replace soc = 53 if occ >= 9005 & occ <= 9760

	replace soc = 55 if occ >= 9800 & occ <= 9830

	* Ind
	gen retail=0
	replace retail=1 if ind90>=580 & ind90<700
	la var retail "Retail Trade industry"

	gen wtran=0
	replace wtran=1 if ind90>=400 & ind90<=432
	la var wtran "Transportation and Warehousing industry"

		
	* Create a harmonized code for number of years of schooling, and potential experience
	
	* a.create schooling variable
	gen 	schooling = .
	replace schooling = 0 	if gradeatn==1
	replace schooling = 4 	if gradeatn==2
	replace schooling = 6 	if gradeatn==3
	replace schooling = 8 	if gradeatn==4
	replace schooling = 9 	if gradeatn==5
	replace schooling = 10 	if gradeatn==6
	replace schooling = 11 	if gradeatn==7
	replace schooling = 12 	if gradeatn==8
	replace schooling = 12 	if gradeatn==9
	replace schooling = 13 	if gradeatn==10
	replace schooling = 14 	if gradeatn==11
	replace schooling = 14 	if gradeatn==12
	replace schooling = 16 	if gradeatn==13
	replace schooling = 18 	if gradeatn==14
	replace schooling = 18 	if gradeatn==15
	replace schooling = 21 	if gradeatn==16

	* b. Create "potential experience" (see DiNardo, Fortin, Lemieux (1996) = age - schooling - 5). Alternatively, see (Mincer (1974) and Card (1999), Handbook of Labor Economics = age - schooling - 6)
	gen exp = .
	replace exp = max(0,age - schooling - 5)
	gen exp_square = .
	replace exp_square = exp^2
	gen exp_cubic = .
	replace exp_cubic = exp^3
	gen age_square = .
	replace age_square = age^2
	gen age_cubic = .
	replace age_cubic = age^3
	label var exp "potential experience"
	label var exp_square "potential experience, square"
	label var exp_cubic "potential experience, cubic"  	
	label var age_square "age, square"
	label var age_cubic "age, cubic"
	
	* Create degree variable
	gen degree=.
	replace degree=6 if gradeatn==16
	replace degree=5 if gradeatn==14 | gradeatn==15
	replace degree=4 if gradeatn==13
	replace degree=3 if gradeatn==11 | gradeatn==12
	replace degree=2 if gradeatn==9 | gradeatn==10
	replace degree=1 if gradeatn<=8
	label define ldegree 1 "Less than High School" 2 "High School" 3 "Associate's" 4 "Bachelor's"  5 "Master's or Professional" 6 "PhD" 
	label val degree ldegree 
	tab degree
	tab gradeatn degree 
/*

	* Identify and remove overtime payments
	gen worksot=.
	replace worksot=1 if hoursuorg>40 & paidhre==1 & otcrec==1 & otcamt>0 & otcamt!=. 
	la var worksot "Works OT if hourly worker with usual hours over 40"
		tab worksot if insample
		sum otcamt if insample
		
	gen othours=.
	replace othours=(hoursuorg-40) if worksot==1 
	la var othours "Number of weekly hours over 40 if OT worker"
		tab othours if insample
		sum othours if insample

	gen otpay=.
	replace otpay=(wage * othours) if worksot==1 //should it be time and a half? Err on highest wage possible to undercount violations
		// Should previous line be wageotc?
	replace otpay=otcamt if (otcamt-otpay<0)
	la var otpay "Weekly pay for hours over 40 if OT worker" 
		tab otpay if insample
		sum otpay if insample
		
	gen weekly_wage_tc=weekpay
	replace weekly_wage_tc= (weekpay - otpay) if worksot==1 
	la var weekly_wage_tc "Weekly pay for all workers incl tips and commissions, excl overtime"
		sum weekpay if insample, d
		sum weekly_wage_tc if insample, d
		*browse hoursuorg othours otcamt otpay weekpay weekly_wage_tc if worksot==1 & insample
		
	gen hourly_wage_tc=(weekly_wage_tc/hoursu1i)
	replace hourly_wage_tc=. if hourly_wage_tc<1
	la var hourly_wage_tc "Hourly rate for all workers incl tips and commissions, excl overtime"
		sum wageotc if insample, d
		sum hourly_wage_tc if insample, d
	
	*replace insample=0 if hourly_wage_tc==.
*/

******************************************************************************
		
	* Create wage variables
	** Weekly Pay (includes OTC)
	gen uwe=weekpay
	la var uwe "Usual weekly earnings, top-coded"
	
/*
	*** Drop OT workers
	replace uwe=. if worksot==1
	kdensity uwe, name(k1, replace)
*/
	

	** Step 1: For hourly workers, use hourly wage 
	gen wage2=.
	
	replace wage2=earnhour if paidhre==1

	** Step 1: Use Usual Weekly Hours	
		** hoursu1: Usual hours worked per week, main job
	replace wage2=(uwe/hoursu1) if uwe!=. & hoursu1!=. & paidhre==0
	
	la var wage2 "Hourly wage"
	kdensity wage2, name(w2a, replace)
	
	** Step 2: Winsorize hourly wage variable
	sum wage2, d
	replace wage2=. if wage2<`r(p5)' | wage2>`r(p95)'
	sum wage2, d
	kdensity wage2, name(w2b, replace)
	
	** Step 3: Log
	gen lnwage2=ln(wage2) if wage2!=. 
	la var lnwage2 "Log of hrly wage"
	kdensity wage2, name(w2c, replace)
		
********************************************************************************
* Merge in commuting zones
********************************************************************************
	
	* Prep state fips
	gen str3 statefips_str = string(statefips,"%02.0f")
	
	* Prep county fips
	gen str3 countyfips_str = string(countyfips,"%03.0f")
	replace countyfips_str="" if countyfips_str=="000"
	
	* Create complete county fips code
	egen cty_fips_str=concat(statefips_str countyfips_str) if statefips_str!="" & countyfips_str!=""
		tab cty_fips_str
	destring cty_fips_str, gen(cty_fips)
	replace cty_fips=12025 if cty_fips==12086 // From https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf: Florida, 1997: Dade county (FIPS 12025) is renamed as Miami-Dade county (FIPS 12086). Action: replace FIPS code 12086 with the old code 12025.
	
	* Merge in commuting zones
	merge m:1 cty_fips using "$data/crosswalks/cw_cty_czone.dta"
	
	tab cty_fips if _merge==1
	/*
	   cty_fips |      Freq.     Percent        Cum.
------------+-----------------------------------
       8014 |          6        0.06        0.06
      12086 |     10,549       99.94      100.00
------------+-----------------------------------
      Total |     10,555      100.00
	  
From https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf: Colorado, 2001: Broomfield county (FIPS 8014) is created out of parts of Adams, Boulder, Jefferson, and Weld counties. According to Wikipedia, Boulder county lost about 40,000 inhabitants to Broomfield county in 2001. In turn, Broomfield's website indicates a population of 42,169 for 2003  which suggests that most of Broomfield's population came from Boulder county. Action: assign FIPS code 8014 to CZ 28900 which comprises Boulder county (as well as Adams and Jefferson counties).	  
	  */
	  
	replace czone=28900 if cty_fips==8014
	drop if _merge==2
	drop _merge  
	
********************************************************************************
* Merge in data on applicable minimum wages
********************************************************************************

/*
* Merge in state and local minimum wages
do "$cps/documentation/MinimumWages.do"
	
* Code FLSA exemptions for workers' applicable minimum wage
do "$cps/documentation/FLSAExemptions.do"
*/
	
********************************************************************************

save "$data/cps/output/cpsorg_2013-2023.dta", replace

*END
